Libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(knitr)
library(cowplot) # for plot_grid and draw_label
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library(RColorBrewer)
library(pander) # to make session info
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(snakecase)
library(corrr)
Config
analysis_modifier <- "main"
inputs_dir <- "inputs"
data_file <- tibble(files = list.files(inputs_dir)) %>%
arrange(desc(files)) %>%
top_n(1, wt = files) %>%
pull(files)
print(data_file)
## [1] "survey_data.2020-07-29_11.35.40.txt"
Settings
min_genes_gt_0 <- 100
Functions
is_outlier <- function(x) {
(x > quantile(x, 0.75) + 1.5*IQR(x)) |
(x < quantile(x, 0.25) - 1.5*IQR(x))
}
most_common_val_w_percent <- function(value_vector = c("a", "a", "b")){
mc_vals <- tabyl(value_vector) %>%
arrange(desc(n)) %>%
top_n(1, n)
paste0(round(100*mc_vals$percent), "% ", mc_vals$value_vector)
}
# StatBin2 allows depiction of empty bins as blank instead of a horizontal line:
# https://stackoverflow.com/questions/57128090/remove-baseline-color-for-geom-histogram
StatBin2 <- ggproto(
"StatBin2",
StatBin,
compute_group = function (data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, origin = NULL, right = NULL,
drop = NULL, width = NULL) {
if (!is.null(breaks)) {
if (!scales$x$is_discrete()) {
breaks <- scales$x$transform(breaks)
}
bins <- ggplot2:::bin_breaks(breaks, closed)
}
else if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(data$x)
}
bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth,
center = center, boundary = boundary,
closed = closed)
}
else {
bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins,
center = center, boundary = boundary,
closed = closed)
}
res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)
# drop 0-count bins completely before returning the dataframe
res <- res[res$count > 0, ]
res
})
Import data
col_spec <- cols(
.default = col_double(),
dataset_id = col_character(),
disease = col_character(),
pedaya = col_character(),
race = col_character(),
ethnicity = col_character(),
gender = col_character(),
cohort_name = col_character(),
cohort_code = col_character(),
doi_link = col_character(),
source_repository = col_character(),
repository_cohort_accession = col_character(),
repository_dataset_accession = col_character()
)
counts_and_meta_raw <- read_tsv(file.path(inputs_dir, data_file), col_types = col_spec)
counts_and_meta <- counts_and_meta_raw %>%
group_by(cohort_code) %>%
mutate(n_in_cohort = n())
write_csv(counts_and_meta, "results/Table_S1.csv")
Colors
# brewer.pal(8, "Paired")
these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
these_colors <- c("#1F78B4", "#1F78B4", "#33A02C", "#33A02C", "#E31A1C", "#E31A1C",
"#FF7F00", "#FF7F00")
names(these_colors) <- c("not assigned", "Total reads", "Non-exonic reads", "Exonic reads", "Duplicate reads", "Non-duplicate reads", "Unmapped reads", "Mapped reads")
these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")
Datasets analyzed
nrow(counts_and_meta)
## [1] 2179
Diseases in dataset
counts_and_meta %>% tabyl(disease)
## disease n percent
## acute lymphoblastic leukemia 680 0.3120697568
## acute megakaryoblastic leukemia 103 0.0472693896
## acute myeloid leukemia 221 0.1014226709
## acute undifferentiated leukemia 1 0.0004589261
## adrenocortical adenoma 1 0.0004589261
## adrenocortical cancer 2 0.0009178522
## adrenocortical carcinoma 20 0.0091785223
## alveolar rhabdomyosarcoma 40 0.0183570445
## cholangiocarcinoma 1 0.0004589261
## choroid plexus carcinoma 25 0.0114731528
## desmoplastic small round cell tumor 4 0.0018357045
## dysembryoplastic neuroepithelial tumor 13 0.0059660395
## embryonal rhabdomyosarcoma 42 0.0192748967
## embryonal tumor with multilayered rosettes 1 0.0004589261
## ependymoma 98 0.0449747591
## epithelioid sarcoma 1 0.0004589261
## Ewing sarcoma 70 0.0321248279
## fibrolamellar hepatocellular carcinoma 3 0.0013767783
## fibromatosis 1 0.0004589261
## gastrointestinal stromal tumor 4 0.0018357045
## germ cell tumor 1 0.0004589261
## glioblastoma multiforme 29 0.0133088573
## glioma 193 0.0885727398
## hepatoblastoma 1 0.0004589261
## hepatocellular carcinoma 3 0.0013767783
## hypereosinophillic syndrome 1 0.0004589261
## juvenile myelomonocytic leukemia 1 0.0004589261
## leukemia 1 0.0004589261
## lymphoma 49 0.0224873795
## medulloblastoma 201 0.0922441487
## melanoma 7 0.0032124828
## melanotic neuroectodermal tumor 1 0.0004589261
## meningioma 1 0.0004589261
## myofibromatosis 2 0.0009178522
## nasopharyngeal carcinoma 2 0.0009178522
## neuroblastoma 12 0.0055071134
## neuroendocrine carcinoma 1 0.0004589261
## not reported 2 0.0009178522
## not reported - QC fail by PI 1 0.0004589261
## osteosarcoma 157 0.0720513997
## ovarian serous cystadenocarcinoma 1 0.0004589261
## PEComa 1 0.0004589261
## pleuropulmonary blastoma 1 0.0004589261
## retinoblastoma 2 0.0009178522
## rhabdoid tumor 65 0.0298301973
## rhabdomyosarcoma 53 0.0243230840
## spindle cell/sclerosing rhabdomyosarcoma 3 0.0013767783
## supratentorial embryonal tumor NOS 18 0.0082606700
## synovial sarcoma 22 0.0100963745
## undifferentiated sarcoma NOS 3 0.0013767783
## undifferentiated spindle cell sarcoma 2 0.0009178522
## wilms tumor 11 0.0050481872
disease_freq_table <- counts_and_meta$disease %>%
str_to_sentence %>%
str_replace(" nos$", " NOS") %>%
as.factor %>%
fct_infreq %>%
fct_lump(prop=0.01) %>%
tabyl (var1 = "Disease") %>%
adorn_pct_formatting(digits = 1)
colnames(disease_freq_table)[1]="Disease"
write_tsv(disease_freq_table, "results/disease_freq_table.tsv")
disease_freq_table %>%
kable %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
Disease
|
n
|
percent
|
|
Acute lymphoblastic leukemia
|
680
|
31.2%
|
|
Acute myeloid leukemia
|
221
|
10.1%
|
|
Medulloblastoma
|
201
|
9.2%
|
|
Glioma
|
193
|
8.9%
|
|
Osteosarcoma
|
157
|
7.2%
|
|
Acute megakaryoblastic leukemia
|
103
|
4.7%
|
|
Ependymoma
|
98
|
4.5%
|
|
Ewing sarcoma
|
70
|
3.2%
|
|
Rhabdoid tumor
|
65
|
3.0%
|
|
Rhabdomyosarcoma
|
53
|
2.4%
|
|
Lymphoma
|
49
|
2.2%
|
|
Embryonal rhabdomyosarcoma
|
42
|
1.9%
|
|
Alveolar rhabdomyosarcoma
|
40
|
1.8%
|
|
Glioblastoma multiforme
|
29
|
1.3%
|
|
Choroid plexus carcinoma
|
25
|
1.1%
|
|
Synovial sarcoma
|
22
|
1.0%
|
|
Other
|
131
|
6.0%
|
# many are leukemias
table(str_detect(counts_and_meta$disease, "leukemia"))
##
## FALSE TRUE
## 1172 1007
Cohorts in dataset
old_cohort_size_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(n_in_cohort, group=cohort_code),
fill = "lightgrey", color = "black", stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) + xlab("Cohort size") +
ylab("Cohorts") +
theme(legend.position="none")
# max(table(counts_and_meta$n_in_cohort)/as.numeric(names(table(counts_and_meta$n_in_cohort))))
cohort_hist_bin_size <- 10
cohort_size_histogram_raw <- ggplot(counts_and_meta %>%
select(cohort_code, n_in_cohort) %>%
distinct ) +
geom_histogram(aes(n_in_cohort),
fill = "lightgrey", color = "black", stat = StatBin2,
breaks = seq(0,500, by=10)) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Cohort size") +
theme(legend.position="none")
max_y_val <- max(ggplot_build(cohort_size_histogram_raw)$data[[1]][,"y"])
cohort_size_histogram <- cohort_size_histogram_raw +
scale_y_continuous("Cohorts", breaks = seq(0, ceiling(max_y_val/2) * 2, 2))
cohort_size_histogram

n_distinct(counts_and_meta$cohort_code)
## [1] 48
sort(unique(counts_and_meta$n_in_cohort))
## [1] 3 4 6 7 8 10 15 17 19 22 23 24 25 26 31 35 39
## [18] 43 44 54 56 62 63 65 78 82 84 88 89 103 127 137 141 337
counts_and_meta %>%
select(cohort_code, n_in_cohort) %>%
distinct %>%
arrange(desc(n_in_cohort)) %>%
kable
|
cohort_code
|
n_in_cohort
|
|
Cohort_1
|
337
|
|
Cohort_2
|
141
|
|
Cohort_3
|
137
|
|
Cohort_4
|
127
|
|
Cohort_5
|
103
|
|
Cohort_6
|
89
|
|
Cohort_7
|
88
|
|
Cohort_8
|
84
|
|
Cohort_9
|
82
|
|
Cohort_10
|
78
|
|
Cohort_11
|
65
|
|
Cohort_12
|
63
|
|
Cohort_13
|
62
|
|
Cohort_14
|
56
|
|
Cohort_15
|
54
|
|
Cohort_16
|
44
|
|
Cohort_17
|
43
|
|
Cohort_18
|
39
|
|
Cohort_19
|
35
|
|
Cohort_20
|
31
|
|
Cohort_21
|
31
|
|
Cohort_22
|
26
|
|
Cohort_23
|
25
|
|
Cohort_24
|
25
|
|
Cohort_26
|
24
|
|
Cohort_25
|
24
|
|
Cohort_30
|
23
|
|
Cohort_27
|
23
|
|
Cohort_28
|
23
|
|
Cohort_29
|
23
|
|
Cohort_31
|
22
|
|
Cohort_32
|
19
|
|
Cohort_33
|
19
|
|
Cohort_34
|
17
|
|
Cohort_35
|
15
|
|
Cohort_36
|
10
|
|
Cohort_37
|
8
|
|
Cohort_38
|
8
|
|
Cohort_41
|
7
|
|
Cohort_39
|
7
|
|
Cohort_40
|
7
|
|
Cohort_42
|
6
|
|
Cohort_44
|
6
|
|
Cohort_43
|
6
|
|
Cohort_45
|
6
|
|
Cohort_47
|
4
|
|
Cohort_46
|
4
|
|
Cohort_48
|
3
|
counts_and_meta %>%
select(cohort_code, n_in_cohort) %>%
distinct %>%
pull(n_in_cohort) %>%
median
## [1] 24.5
Table S2 - repositories
repo_info <- read_tsv("inputs/repos.tsv")
## Parsed with column specification:
## cols(
## repo_name = col_character(),
## repo_abbrev = col_character(),
## repo_url = col_character(),
## source_accession_pattern = col_character()
## )
repo_table <- repo_info %>%
filter(repo_abbrev %in% counts_and_meta$source_repository) %>%
rename(`Name`=repo_name, `Abbreviation`=repo_abbrev, URL = repo_url) %>%
select(-source_accession_pattern)
write_tsv(repo_table, "results/Table_S2_repo_table.tsv")
repo_table %>%
kable %>%
kable_styling(bootstrap_options = "striped", full_width = F)
Table S3 - data access table
cohort_table <- counts_and_meta %>%
select(cohort_code, cohort_name, source_repository, repository_cohort_accession, n_in_cohort) %>% # removed DOI link
# group_by(across(c(-disease)))
distinct %>%
arrange(desc(n_in_cohort))
colnames(cohort_table) <- to_any_case(colnames(cohort_table), case = "sentence")
write_tsv(cohort_table, "results/Table_S3_cohort_table.tsv")
cohort_table %>%
kable %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
Cohort code
|
Cohort name
|
Source repository
|
Repository cohort accession
|
N in cohort
|
|
Cohort_1
|
TARGET-10
|
dbGaP
|
phs000218
|
337
|
|
Cohort_2
|
SJC_BALL
|
SJC
|
SJC-DS-1001
|
141
|
|
Cohort_3
|
TARGET-20
|
dbGaP
|
phs000218
|
137
|
|
Cohort_4
|
EGAD00001003279
|
EGA
|
EGAD00001003279
|
127
|
|
Cohort_5
|
SJC_AMLM
|
SJC
|
SJC-DS-1001
|
103
|
|
Cohort_6
|
phs000673
|
dbGaP
|
phs000673
|
89
|
|
Cohort_7
|
TARGET-40
|
dbGaP
|
phs000218
|
88
|
|
Cohort_8
|
phs000720
|
dbGaP
|
phs000720
|
84
|
|
Cohort_9
|
SJC_HGG
|
SJC
|
SJC-DS-1001
|
82
|
|
Cohort_10
|
SJC_EPD
|
SJC
|
SJC-DS-1001
|
78
|
|
Cohort_11
|
TARGET-52
|
dbGaP
|
phs000218
|
65
|
|
Cohort_12
|
EGAD00001001098
|
EGA
|
EGAD00001001098
|
63
|
|
Cohort_13
|
phs000768
|
dbGaP
|
phs000768
|
62
|
|
Cohort_14
|
SJC_ETV
|
SJC
|
SJC-DS-1001
|
56
|
|
Cohort_15
|
SJC_LGG
|
SJC
|
SJC-DS-1001
|
54
|
|
Cohort_16
|
SJC_CBF
|
SJC
|
SJC-DS-1001
|
44
|
|
Cohort_17
|
SJC_RHB
|
SJC
|
SJC-DS-1001
|
43
|
|
Cohort_18
|
EGAD00001001620
|
EGA
|
EGAD00001001620
|
39
|
|
Cohort_19
|
phs000699
|
dbGaP
|
phs000699
|
35
|
|
Cohort_20
|
CBTTC
|
Cavatica
|
CBTTC
|
31
|
|
Cohort_21
|
SJC_ERG
|
SJC
|
SJC-DS-1001
|
31
|
|
Cohort_22
|
SJC_PHALL
|
SJC
|
SJC-DS-1001
|
26
|
|
Cohort_23
|
EGAD00001001666
|
EGA
|
EGAD00001001666
|
25
|
|
Cohort_24
|
SJC_CPC
|
SJC
|
SJC-DS-1001
|
25
|
|
Cohort_26
|
phs000900
|
dbGaP
|
phs000900
|
24
|
|
Cohort_25
|
EGAD00001000648
|
EGA
|
EGAD00001000648
|
24
|
|
Cohort_30
|
TARGET-21
|
dbGaP
|
phs000218
|
23
|
|
Cohort_27
|
EGAD00001000356
|
EGA
|
EGAD00001000356
|
23
|
|
Cohort_28
|
EGAD00001000617
|
EGA
|
EGAD00001000617
|
23
|
|
Cohort_29
|
SJC_OS
|
SJC
|
SJC-DS-1001
|
23
|
|
Cohort_31
|
EGAD00001001927
|
EGA
|
EGAD00001001927
|
22
|
|
Cohort_32
|
EGAD00001002680
|
EGA
|
EGAD00001002680
|
19
|
|
Cohort_33
|
SRP126664
|
SRA
|
SRP126664
|
19
|
|
Cohort_34
|
EGAD00001000158
|
EGA
|
EGAD00001000158
|
17
|
|
Cohort_35
|
SRP092501
|
SRA
|
SRP092501
|
15
|
|
Cohort_36
|
EGAD00001000826
|
EGA
|
EGAD00001000826
|
10
|
|
Cohort_37
|
SJC_E
|
SJC
|
SJC-DS-1001
|
8
|
|
Cohort_38
|
SJC_MB
|
SJC
|
SJC-DS-1001
|
8
|
|
Cohort_41
|
TARGET-30
|
dbGaP
|
phs000218
|
7
|
|
Cohort_39
|
SJC_HYPO
|
SJC
|
SJC-DS-1001
|
7
|
|
Cohort_40
|
SJC_MEL
|
SJC
|
SJC-DS-1001
|
7
|
|
Cohort_42
|
EGAD00001000328
|
EGA
|
EGAD00001000328
|
6
|
|
Cohort_44
|
SJC_Other
|
SJC
|
SJC-DS-1001
|
6
|
|
Cohort_43
|
SJC_INF
|
SJC
|
SJC-DS-1001
|
6
|
|
Cohort_45
|
SJC_WLM
|
SJC
|
SJC-DS-1001
|
6
|
|
Cohort_47
|
TARGET-50
|
dbGaP
|
phs000218
|
4
|
|
Cohort_46
|
SRP006575
|
SRA
|
SRP006575
|
4
|
|
Cohort_48
|
SRP040454
|
SRA
|
SRP040454
|
3
|
Race and gender in dataset
counts_and_meta %>%
tabyl(race)
## race n percent valid_percent
## Asian 27 0.0123910050 0.044850498
## Black/African American 70 0.0321248279 0.116279070
## Native Hawaiian or Other Pacific Islander 3 0.0013767783 0.004983389
## not-white (no other info provided) 1 0.0004589261 0.001661130
## Other 7 0.0032124828 0.011627907
## White 494 0.2267094998 0.820598007
## <NA> 1577 0.7237264800 NA
counts_and_meta %>%
select(race) %>%
na.omit() %>%
tabyl(race) %>%
adorn_totals("row")
## Adding missing grouping variables: `cohort_code`
## race n percent
## Asian 27 0.044850498
## Black/African American 70 0.116279070
## Native Hawaiian or Other Pacific Islander 3 0.004983389
## not-white (no other info provided) 1 0.001661130
## Other 7 0.011627907
## White 494 0.820598007
## Total 602 1.000000000
counts_and_meta %>%
select(ethnicity) %>%
na.omit() %>%
tabyl(ethnicity) %>%
adorn_totals("row")
## Adding missing grouping variables: `cohort_code`
## ethnicity n percent
## Hispanic or Latino 128 0.1486643
## Not Hispanic or Latino 733 0.8513357
## Total 861 1.0000000
counts_and_meta %>%
filter(ethnicity == "Hispanic or Latino") %>%
tabyl(race)
## race n percent valid_percent
## Black/African American 1 0.0078125 0.01123596
## Other 3 0.0234375 0.03370787
## White 85 0.6640625 0.95505618
## <NA> 39 0.3046875 NA
counts_and_meta %>%
filter(ethnicity == "Hispanic or Latino") %>%
select(race) %>%
na.omit() %>%
tabyl(race) %>%
adorn_totals("row")
## Adding missing grouping variables: `cohort_code`
## race n percent
## Black/African American 1 0.01123596
## Other 3 0.03370787
## White 85 0.95505618
## Total 89 1.00000000
Genders in dataset
table(counts_and_meta$gender, useNA = "always")
##
## female male <NA>
## 709 983 487
# samples with reported genders
counts_and_meta %>%
filter(! is.na(gender)) %>%
filter(! gender %in% c("not reported", "unknown")) %>%
tabyl(gender) %>%
adorn_totals
## gender n percent
## female 709 0.4190307
## male 983 0.5809693
## Total 1692 1.0000000
Ages in dataset
table(counts_and_meta$pedaya, useNA = "always")
##
## No Yes, age < 30 years <NA>
## 67 2088 24
# at least one target sample is >30
n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | counts_and_meta$age_at_dx<30, na.rm = TRUE)
n_ped_aya
## [1] 2088
n_ped_aya/nrow(counts_and_meta)
## [1] 0.9582377
# more than 95% of which are pediatric <30; of the Datasets with specific ages at diagnosis reported, the median was 7
median(counts_and_meta$age_at_dx, na.rm = TRUE)
## [1] 8.32
table(is.na(counts_and_meta$age_at_dx))
##
## FALSE TRUE
## 1472 707
Read lengths in dataset
seq_len_round_25 <- round(counts_and_meta$seq_length / 25) * 25
table(seq_len_round_25, useNA = "always")
## seq_len_round_25
## 25 50 75 100 125 <NA>
## 4 238 366 1544 27 0
tabyl(seq_len_round_25)
## seq_len_round_25 n percent
## 25 4 0.001835704
## 50 238 0.109224415
## 75 366 0.167966957
## 100 1544 0.708581918
## 125 27 0.012391005
table(is.na(counts_and_meta$seq_length))
##
## FALSE
## 2179
table(counts_and_meta$seq_length)
##
## 36 50 51 75 76 81 100 101 110 125
## 4 11 227 279 40 47 98 1431 15 27
summary(counts_and_meta$seq_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36.00 76.00 101.00 91.51 101.00 125.00
IQR(counts_and_meta$seq_length, na.rm = TRUE)
## [1] 25
old_read_length_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(seq_length, group=cohort_code),
fill = "lightgrey", color = "black", stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Sequence length") +
ylab("Datasets")
read_length_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(seq_length),
fill = "lightgrey", color = "black", stat = StatBin2,
binwidth = 1) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Sequence length") +
ylab("Datasets") +
expand_limits(x=0)
read_length_histogram
# Figure 1
figure_name <- "f1"
f1 <- plot_grid(cohort_size_histogram,
read_length_histogram,
nrow=2,
labels = c("A", "B")) +
draw_label(paste(figure_name, Sys.time()),
x = 0, y = 0.02, hjust = 0, size = 6)
#
# right_side <- plot_grid(read_type_schematic,
# read_type_fractions_schematic,
# nrow=2,
# rel_heights = c(1, 2),
# labels = c("A", "B"))
#
# f2 <- plot_grid(left_side,
# right_side,
# ncol = 2) +
# draw_label(paste(figure_name, Sys.time()),
# x = 0, y = 0.02, hjust = 0, size = 6)
f1

ggsave("results/Figure_1.png", f1, height=6, width=10)
#
#
# f1 <- plot_grid(cohort_size_histogram,
# read_length_histogram,
# nrow=2,
# labels = c("A", "B")) +
# draw_label(paste(figure_name, Sys.time()),
# x = 0, y = 0.02, hjust = 0, size = 6)
Check for counting errors
if(nrow(subset(counts_and_meta, Mapped > Total_reads)) > 0) stop("Counting error")
if(nrow(subset(counts_and_meta, MND > Mapped)) > 0) stop("Counting error")
if(nrow(subset(counts_and_meta, MEND > MND)) > 0) stop("Counting error")
Summarize total read counts
min(counts_and_meta$Total_reads)
## [1] 206916
Total_read_counts <- counts_and_meta %>%
ungroup %>%
mutate(dataset_value = Total_reads/1e6) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(Total_read_counts %>%
select(-variance, -p25, -p75), digits = 1)
|
min
|
max
|
median
|
iqr
|
|
0.2
|
668
|
61.2
|
53.3
|
total_read_count_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(Total_reads/1E6),
fill = "lightgrey", color = "black", stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Total reads") +
ylab("Datasets")
total_read_count_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate percentages
read_counts_with_read_type_fractions <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(
pct_unmapped_of_total = (Total_reads - Mapped) / Total_reads,
pct_dupe_of_mapped = (Mapped - MND) / Mapped,
pct_non_exonic_of_non_dupe = (MND - MEND) / MND
)
read_type_fractions_long <- read_counts_with_read_type_fractions %>%
pivot_longer(cols = starts_with("pct_"), names_to = "read_type_fraction_name", values_to = "dataset_value")
Summarize read type fractions
comparison_of_distributions <- read_type_fractions_long %>%
group_by(read_type_fraction_name) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(comparison_of_distributions %>%
mutate_if(is.double, ~100*.), digits = 3) %>%
kable_styling(bootstrap_options = "striped", full_width = T)
|
read_type_fraction_name
|
variance
|
min
|
max
|
median
|
p25
|
p75
|
iqr
|
|
pct_dupe_of_mapped
|
5.884
|
2.853
|
99.997
|
26.887
|
13.450
|
43.033
|
29.583
|
|
pct_non_exonic_of_non_dupe
|
4.094
|
4.495
|
97.000
|
24.769
|
16.189
|
37.277
|
21.088
|
|
pct_unmapped_of_total
|
0.605
|
0.710
|
76.677
|
3.414
|
2.740
|
6.006
|
3.266
|
kable(comparison_of_distributions %>%
select(-variance, -p25, -p75) %>%
mutate_if(is.double, ~100*.), digits = 0) %>%
kable_styling(bootstrap_options = "striped", full_width = T)
|
read_type_fraction_name
|
min
|
max
|
median
|
iqr
|
|
pct_dupe_of_mapped
|
3
|
100
|
27
|
30
|
|
pct_non_exonic_of_non_dupe
|
4
|
97
|
25
|
21
|
|
pct_unmapped_of_total
|
1
|
77
|
3
|
3
|
statements about read type fractions
See comparison_of_distributions table above
unmapped
# in 77 datasets, more than 25% of reads are unmapped
table(read_counts_with_read_type_fractions$pct_unmapped_of_total>0.25)
##
## FALSE TRUE
## 2102 77
duplicate
# 426 datasets have more than 50% duplicates (Fig 2A).
read_counts_with_read_type_fractions %>%
mutate(above_50 = pct_dupe_of_mapped>0.50) %>%
tabyl(above_50)
## above_50 n percent
## FALSE 1753 0.8044975
## TRUE 426 0.1955025
fiftypct <- read_counts_with_read_type_fractions %>%
group_by(cohort_code) %>%
summarize(has_a_dataset_with_more_than_50pct_dupes = any(pct_dupe_of_mapped>0.50),
median_pct_dupe_of_mapped = median(pct_dupe_of_mapped)) %>%
mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_dupes)
fiftypct %>% tabyl(median_lt_50)
## median_lt_50 n percent
## FALSE 7 0.1458333
## TRUE 41 0.8541667
fiftypct %>% tabyl(has_a_dataset_with_more_than_50pct_dupes)
## has_a_dataset_with_more_than_50pct_dupes n percent
## FALSE 15 0.3125
## TRUE 33 0.6875
fiftypct %>% tabyl(median_lt_50_and_has_a_dataset)
## median_lt_50_and_has_a_dataset n percent
## FALSE 22 0.4583333
## TRUE 26 0.5416667
exonic
read_counts_with_read_type_fractions %>%
mutate(above_50 = pct_non_exonic_of_non_dupe>0.50) %>%
tabyl(above_50)
## above_50 n percent
## FALSE 1849 0.8485544
## TRUE 330 0.1514456
fiftypcte <- read_counts_with_read_type_fractions %>%
group_by(cohort_code) %>%
summarize(has_a_dataset_with_more_than_50pct_ne = any(pct_non_exonic_of_non_dupe>0.50),
median_pct_dupe_of_mapped = median(pct_non_exonic_of_non_dupe)) %>%
mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_ne)
fiftypcte %>% tabyl(median_lt_50)
## median_lt_50 n percent
## FALSE 3 0.0625
## TRUE 45 0.9375
fiftypcte %>% tabyl(has_a_dataset_with_more_than_50pct_ne)
## has_a_dataset_with_more_than_50pct_ne n percent
## FALSE 36 0.75
## TRUE 12 0.25
fiftypcte %>% tabyl(median_lt_50_and_has_a_dataset)
## median_lt_50_and_has_a_dataset n percent
## FALSE 39 0.8125
## TRUE 9 0.1875
Correlation of depth of sequencing to duplicate read fraction
Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?
cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads,
read_counts_with_read_type_fractions$pct_dupe_of_mapped,
method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34
Generate schematic defining read types
colors_for_read_types <- these_colors_with_MEND
names(colors_for_read_types) <- gsub(" reads", "", names(these_colors_with_MEND))
colors_for_read_types <- c(colors_for_read_types,
"Genome" = "darkblue",
"Exon" = "darkblue",
"Mapped reads" = "darkgrey")
plot1_colnames <- c("label", "x1", "x2", "y1", "y2")
read_type_schematic_data_as_vector <- c("Duplicate", 3, 4, 5, 6,
"MEND", 3, 4, 4, 5,
"MEND", 2.5, 3.5, 3, 4,
"Non-exonic", 4.5, 5.5, 3, 4,
"Unmapped", 6, 7, 3, 4,
"Exon", 2.5, 4, 1, 2,
"Mapped reads", 2.8, 2.8, 6, 7)
rows_in_schematic <- length(read_type_schematic_data_as_vector)/length(plot1_colnames)
read_type_schematic_data <- matrix(read_type_schematic_data_as_vector,
ncol = length(plot1_colnames), byrow = TRUE,
dimnames = list(c(1:rows_in_schematic), plot1_colnames)) %>%
as_tibble %>%
type_convert() %>%
mutate(
base_color = colors_for_read_types[match(label, names(colors_for_read_types))],
border_color = case_when(
label %in% c("Mapped reads", "Genome", "MEND", "Exon") ~ NA_character_,
TRUE ~ base_color),
fill_color = ifelse(label %in% c("MEND", "Exon"), base_color, "white"),
text_color = case_when(
label %in% c("MEND", "Exon")~ "white",
TRUE ~ base_color),
mid_bar_x = map2_dbl(x1, x2, function(x, y) mean(c(x,y))),
mid_bar_y = map2_dbl(y1, y2, function(x, y) mean(c(x,y)))
)
## Parsed with column specification:
## cols(
## label = col_character(),
## x1 = col_double(),
## x2 = col_double(),
## y1 = col_double(),
## y2 = col_double()
## )
padding_size = 0.35
end_of_box <- sort(read_type_schematic_data$x2,partial=length(read_type_schematic_data$x2)-1)[length(read_type_schematic_data$x2)-1] + padding_size
read_type_schematic <- ggplot(read_type_schematic_data,
aes(xmin=x1, xmax=x2, ymin=y1, ymax = y2,
fill = fill_color, color = border_color)) +
geom_rect() +
geom_segment(x=min(read_type_schematic_data$x1-padding_size), y=1.5, xend = 5.75, yend = 1.5, color = "darkblue") +
geom_text(aes(label = label, x = mid_bar_x, y=mid_bar_y,
color = text_color),
size = 4,
fontface = "bold") +
geom_rect(aes(xmin = min(x1 - padding_size),
ymin = 2.5,
xmax = end_of_box,
ymax=max(y2 + padding_size)),
fill = NA, color = "darkgrey", size = 0.25) +
scale_fill_identity() +
scale_color_identity() +
theme_void() +
theme(
panel.grid.major.x = element_line(color="lightgrey", size=0.2),
) +
scale_x_continuous(
breaks = seq(min(read_type_schematic_data$x1) - padding_size + 0.25, end_of_box , 0.25))
read_type_schematic

Generate schematic defining read type fractions
stat_names <- c("pct_unmapped_of_total", "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_labels <- c("Unmapped", "Duplicate", "Non-exonic")
complement_stat_names <- c("Mapped reads", "Non-duplicate reads", "Exonic reads")
divisor_name = c("total", "mapped", "non-duplicate")
comparison_of_remaining <- read_type_fractions_long %>%
mutate(read_type_fraction_name = complement_stat_names[match(read_type_fraction_name, stat_names)]) %>%
group_by(read_type_fraction_name) %>%
summarize(min = round(1-max(dataset_value), 2),
max = round(1-min(dataset_value), 2),
median = round(1-median(dataset_value), 2),
p25 = round(1-quantile(dataset_value, 0.25), 2),
p75 = round(1-quantile(dataset_value, 0.75), 2)) %>%
# mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x͂=",100* median, "%)"))
mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; med.=", 100* median, "%)"))
positive_bars <- comparison_of_remaining %>%
select(-min, -max, -p25, -p75) %>%
mutate(read_type_fraction_label = read_type_fraction_name,
position = "1") %>%
rename(abs_median = median)
negative_bars <- tibble(read_type_fraction_name = positive_bars$read_type_fraction_name,
read_type_fraction_label = stat_labels[match(read_type_fraction_name, complement_stat_names)],
bar_label = read_type_fraction_label,
abs_median = 1-positive_bars$abs_median,
position = "2")
total_bar <- tibble(read_type_fraction_name = "Total reads",
read_type_fraction_label = read_type_fraction_name,
bar_label = "Total reads\n(100% of reads)",
abs_median = 1,
position = "1"
)
MEND_stats_of_total <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(
pct_MEND_of_total = (Total_reads - MEND) / Total_reads
) %>%
ungroup %>%
summarize(min = round(1-max(pct_MEND_of_total), 2),
max = round(1-min(pct_MEND_of_total), 2),
median = round(1-median(pct_MEND_of_total), 2))
MEND_bar <- tibble(read_type_fraction_name = "MEND reads",
read_type_fraction_label = read_type_fraction_name,
bar_label = with(MEND_stats_of_total, paste0 ("MEND reads\n(", 100*min, "-", 100*max, "% of total reads; med.=",100* median, "%)")),
abs_median = 1,
position = "1"
)
bar_names_in_order <- c("Total reads", "Mapped reads", "Non-duplicate reads", "Exonic reads", "MEND reads")
all_bars <- bind_rows(positive_bars, negative_bars, total_bar, MEND_bar) %>%
mutate(read_type_fraction_name = factor(read_type_fraction_name,
levels = bar_names_in_order))
cat_space <- all_bars %>% filter(position == 1) %>%
arrange(read_type_fraction_name) %>%
select(read_type_fraction_name, abs_median) %>%
ungroup %>%
mutate(lagged1_abs_median = lag(abs_median, default =1),
lagged2_abs_median = lag(abs_median, n=2, default =1),
category_space = lagged1_abs_median*lagged2_abs_median
) %>%
select(read_type_fraction_name, category_space)
these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")
all_bars_rel <- left_join(all_bars, cat_space, by = "read_type_fraction_name") %>%
mutate(rel_median=ifelse(read_type_fraction_name == "MEND reads",
abs_median[read_type_fraction_label=="Exonic reads"]*
category_space[read_type_fraction_label=="Exonic reads"],
abs_median*category_space),
read_type_fraction_name = factor(read_type_fraction_name, levels = rev(bar_names_in_order)),
position = factor(position, levels = rev(sort(unique(position)))),
bar_color = these_colors_with_MEND[match(read_type_fraction_name, names(these_colors_with_MEND))],
fill_val = ifelse(position == 1, bar_color, NA),
border_color_val = bar_color,
text_color = ifelse(position ==1, "white", "black"),
font_size = ifelse(abs_median<0.1, 3,3.5),
text_angle = ifelse(abs_median<0.1, 90,0)
)
read_type_fractions_schematic
read_type_fractions_schematic <- ggplot(all_bars_rel, aes(y=read_type_fraction_name,
x=rel_median, group = position)) +
geom_col(aes(fill = fill_val,
color = border_color_val
)) +
geom_text(aes(label=bar_label,
color = text_color,
size = font_size,
angle= text_angle),
position = position_stack(vjust = .5),
fontface = "bold") +
scale_fill_identity() +
scale_color_identity() +
scale_size_identity() +
theme_void() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
ggplot(all_bars_rel, aes(y=read_type_fraction_name,
x=rel_median, group = position)) +
geom_col(aes(fill = fill_val,
color = border_color_val
)) +
geom_text(aes(label=read_type_fraction_label,
color = text_color,
size = font_size,
angle = text_angle),
position = position_stack(vjust = .5),
fontface = "bold") +
scale_fill_identity() +
scale_color_identity() +
scale_size_identity() +
theme_void() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())

read_type_fractions_schematic

Plot read fractions
colors_for_stat_names <- these_colors[c("Mapped reads", "Non-duplicate reads","Exonic reads")]
names(colors_for_stat_names) <- stat_names
these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
names(these_colors) <- c("not assigned", "Total reads", "Non-exonic reads", "Exonic reads", "Duplicate reads", "Non-duplicate reads", "Unmapped reads", "Mapped reads")
# 6 x 11 was a good ratio, but the text was too small
read_type_fractions_long_for_histogram <- read_type_fractions_long %>%
mutate(read_type_fraction_name = factor(read_type_fraction_name,
levels = stat_names))
read_type_fractions_histogram <- ggplot(read_type_fractions_long_for_histogram) +
geom_histogram(aes(x = dataset_value, color = read_type_fraction_name),
fill = "white", stat = StatBin2) +
scale_color_manual(values = colors_for_stat_names) +
facet_wrap(~read_type_fraction_name,
nrow = 1,
scales = "free_y",
labeller = labeller(
read_type_fraction_name = stat_label
)
) +
ylab("Datasets") +
xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_histogram$dataset_id)), " datasets")) +
scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.x = element_text(size = 12, face = "bold")) +
theme(legend.position = "none")
read_type_fractions_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate MEND reads as a fraction of total
read_counts_with_MEND_fractions <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(pct_MEND_of_total = MEND /Total_reads)
ggplot(read_counts_with_MEND_fractions) +
geom_histogram(aes(pct_MEND_of_total), stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

MEND_pct <- read_counts_with_MEND_fractions %>%
ungroup %>%
mutate(dataset_value = pct_MEND_of_total) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(MEND_pct %>%
select(-variance, -p25, -p75) %>%
mutate_if(is.double, ~100*.), digits = 0) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
|
min
|
max
|
median
|
iqr
|
|
0
|
79
|
50
|
31
|
head(sort(read_counts_with_MEND_fractions$pct_MEND_of_total))
## [1] 0.0000267009 0.0002533711 0.0003651723 0.0003767257 0.0003997294
## [6] 0.0004279584
sum(read_counts_with_MEND_fractions$pct_MEND_of_total<0.25)
## [1] 355
read_counts_with_MEND_fractions %>%
ungroup %>%
mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
tabyl(pct_under_25) %>%
adorn_totals()
## pct_under_25 n percent
## FALSE 1824 0.8370812
## TRUE 355 0.1629188
## Total 2179 1.0000000
read_counts_with_MEND_fractions %>%
ungroup %>%
filter(Mapped > 30E6) %>%
mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
tabyl(pct_under_25) %>%
adorn_totals()
## pct_under_25 n percent
## FALSE 1739 0.8368624
## TRUE 339 0.1631376
## Total 2078 1.0000000
Show per-cohort distribution of read_type_fractions
figure_name <- "fracs_by_group"
read_type_fractions_long_for_boxplot <- read_type_fractions_long_for_histogram %>%
ungroup %>%
mutate(boxplot_label = fct_reorder(paste0(as.character(cohort_code), ", n=", n_in_cohort), n_in_cohort))
read_type_fractions_per_cohort_boxplot <- ggplot(read_type_fractions_long_for_boxplot) +
geom_boxplot(aes(x = dataset_value, y=boxplot_label), outlier.size = 0.5) +
facet_wrap(~read_type_fraction_name,
nrow = 1,
labeller = labeller(
read_type_fraction_name = stat_label
)
) +
ylab("Cohorts") +
xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_boxplot$dataset_id)), " datasets")) +
scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.x = element_text(size = 12, face = "bold")) +
theme(legend.position = "none")
read_type_fractions_per_cohort_boxplot

Characterize Cohort 4
read_type_names <- c("Total_reads", "Mapped", "MND", "MEND")
this_cohort_name <- "EGAD00001003279"
this_cohort_code <- subset(read_counts_with_read_type_fractions, cohort_name == this_cohort_name)$cohort_code[1]
this_cohort_code
## [1] "Cohort_4"
in MS
read_counts_with_read_type_fractions %>%
filter(cohort_code == this_cohort_code) %>%
mutate(gt_98 = pct_dupe_of_mapped > 0.98) %>%
tabyl(gt_98) %>%
add_totals_row
## Warning: 'add_totals_row' is deprecated.
## Use 'adorn_totals("row")' instead.
## See help("Deprecated")
## gt_98 n percent
## FALSE 55 0.4330709
## TRUE 72 0.5669291
## Total 127 1.0000000
read_counts_with_read_type_fractions %>%
filter(cohort_code == this_cohort_code) %>%
filter(pct_dupe_of_mapped > 0.98) %>%
pull(Total_reads) %>% min
## [1] 178294341
other
counts_and_meta_long <- counts_and_meta %>%
pivot_longer(cols = c(Total_reads, Mapped, MEND, MND), names_to = "read_type_name", values_to = "dataset_value") %>%
ungroup %>%
mutate(read_type_name = factor(read_type_name, levels = read_type_names))
counts_and_meta_long_one_proj <- counts_and_meta_long %>%
filter(cohort_code == this_cohort_code)
table(counts_and_meta_long_one_proj$read_type_name)
##
## Total_reads Mapped MND MEND
## 127 127 127 127
# 78 datasets in the cohort have fewer than 0.2 million MEND reads.
sum((counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND"))$dataset_value < 200000)
## [1] 78
those_datasets <- counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND", dataset_value < 200000) %>% pull(dataset_id)
summary((counts_and_meta_long_one_proj %>% filter(read_type_name=="Total_reads", dataset_id %in% those_datasets))$dataset_value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 178294341 199916664 208667776 211606891 223832191 247618832
max_plots_to_include <- 10
n_datasets_to_plot <- ifelse(n_distinct(counts_and_meta_long_one_proj$dataset_id)>max_plots_to_include,
max_plots_to_include,
n_distinct(counts_and_meta_long_one_proj$dataset_id))
plot_word <- ifelse(n_datasets_to_plot==max_plots_to_include, max_plots_to_include, paste("all", n_distinct(counts_and_meta_long_one_proj$dataset_id)))
set.seed(100)
datasets_to_plot <- sample(unique(counts_and_meta_long_one_proj$dataset_id), n_datasets_to_plot)
ggplot(subset(counts_and_meta_long_one_proj, dataset_id %in% datasets_to_plot)) +
geom_col(aes(x=read_type_name, y=dataset_value/1e6, fill = read_type_name)) +
facet_wrap(~dataset_id, ncol = 1, strip.position="right") +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.y = element_text(angle = 0)) +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle(paste("Read counts for", plot_word, "datasets from", this_cohort_code)) +
coord_flip()

ggplot(counts_and_meta_long_one_proj %>% filter(read_type_name == "MEND")) +
geom_histogram(aes(dataset_value/1e6), stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
ggtitle(paste("Distribution of MEND counts in", this_cohort_code))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

bl <- colorRampPalette(c("navy","royalblue","lightskyblue"))(200)
re <- colorRampPalette(c("mistyrose", "red2","darkred"))(200)
ggplot(counts_and_meta %>%
filter(cohort_code == this_cohort_code) %>%
mutate(pct_dupes = (Mapped - MND)/Mapped)) +
geom_point(aes(x=MEND/1e6, y=pct_dupes, fill = Total_reads/1e6), shape=21, color = "black") +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
scale_fill_gradientn(colours = c(bl,"white", re)) +
ggtitle(paste("Relationship of dupe fraction to MEND count in", this_cohort_code), "For detecting whether datasets with less info were sequenced more deeply. \nNormally, higher depth datasets have more MEND reads and more duplicate \nreads than the same dataset sequenced at a lower total depth.")

counts_and_meta_long_one_proj %>%
select(seq_length, dataset_id) %>%
distinct %>%
tabyl(seq_length)
## seq_length n percent
## 51 127 1
Expressed genes
What is the distribution of measured genes?
gene_counts_long <- counts_and_meta %>%
pivot_longer(cols = starts_with("genes_"), names_to = "Expression_threshold") %>%
mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))
ggplot(gene_counts_long) +
geom_histogram(aes(x=value/1e3, fill = Expression_threshold), stat = StatBin) +
facet_wrap(~Expression_threshold) +
theme_minimal() +
scale_fill_brewer(palette = "Set1") +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(legend.position = "none") +
ylab("Datasets") +
xlab("Measured genes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What are the exact lowest gene counts?
sort(counts_and_meta$genes_expressed_above_0) [1:100]
## [1] 9 9 12 12 13 14 14 15 16 16 17
## [12] 18 18 18 18 18 18 19 19 19 19 20
## [23] 20 20 20 20 20 21 21 21 21 21 21
## [34] 22 22 22 23 23 23 23 23 24 24 24
## [45] 24 24 24 24 25 25 25 26 26 26 26
## [56] 27 27 27 27 28 28 28 28 29 29 29
## [67] 29 29 30 30 31 31 32 33 35 36 37
## [78] 40 6098 7107 11830 15323 16793 16837 16943 17233 17531 18288
## [89] 18588 18704 19626 19693 19749 19781 19942 19963 20063 20120 20203
## [100] 20247
correlations with all datasets
# library(corrr)
counts_and_meta %>%
ungroup %>%
select_if(is.double) %>%
correlate() %>%
filter(rowname %in% read_type_names) %>%
rename(Read_type_count=rowname) %>%
mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
arrange(Read_type_count) %>%
select(c(Read_type_count, starts_with("genes_expressed"))) %>%
kable(caption = "Correlations with all datasets", digits = 2) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all datasets
|
Read_type_count
|
genes_expressed_above_0
|
genes_expressed_above_1
|
genes_expressed_above_2
|
genes_expressed_above_3
|
genes_expressed_above_4
|
genes_expressed_above_5
|
|
Total_reads
|
-0.20
|
-0.40
|
-0.40
|
-0.40
|
-0.39
|
-0.39
|
|
Mapped
|
-0.19
|
-0.41
|
-0.41
|
-0.41
|
-0.40
|
-0.39
|
|
MND
|
0.51
|
0.24
|
0.20
|
0.18
|
0.17
|
0.16
|
|
MEND
|
0.48
|
0.27
|
0.26
|
0.26
|
0.26
|
0.26
|
correlations with all but low gene count datasets
counts_and_meta %>%
filter(genes_expressed_above_0 > min_genes_gt_0) %>%
ungroup %>%
select_if(is.double) %>%
correlate() %>%
filter(rowname %in% read_type_names) %>%
rename(Read_type_count=rowname) %>%
mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
arrange(Read_type_count) %>%
select(c(Read_type_count, starts_with("genes_expressed"))) %>%
kable(caption = "Correlations with all but low gene count datasets", digits = 2) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all but low gene count datasets
|
Read_type_count
|
genes_expressed_above_0
|
genes_expressed_above_1
|
genes_expressed_above_2
|
genes_expressed_above_3
|
genes_expressed_above_4
|
genes_expressed_above_5
|
|
Total_reads
|
0.13
|
-0.26
|
-0.28
|
-0.28
|
-0.28
|
-0.28
|
|
Mapped
|
0.21
|
-0.25
|
-0.26
|
-0.27
|
-0.27
|
-0.27
|
|
MND
|
0.51
|
0.04
|
0.02
|
0.00
|
0.00
|
0.00
|
|
MEND
|
0.44
|
0.08
|
0.09
|
0.11
|
0.11
|
0.12
|
Plot correlations betweeen gene counts and read types
Groom data
expr_gene_and_read_counts_to_plot <- counts_and_meta %>%
pivot_longer (cols = c(MND, MEND, Total_reads, Mapped), names_to = "Read_type", values_to = "Read_count") %>%
pivot_longer (cols = starts_with("genes_expressed"), names_to = "Expression_threshold", values_to = "Gene_count") %>%
mutate(Read_type = factor(Read_type, levels = read_type_names))
Identify datasets with unusual gene counts or read depths
datasets_with_normal_expressed_gene_numbers <- counts_and_meta %>%
filter(genes_expressed_above_0 > min_genes_gt_0) %>%
pull(dataset_id)
datasets_with_outlier_gene_counts <- counts_and_meta %>%
ungroup %>%
filter(is_outlier(genes_expressed_above_0)) %>%
pull(dataset_id)
datasets_with_outlier_depths <- counts_and_meta %>%
ungroup %>%
filter(is_outlier(Total_reads)) %>%
pull(dataset_id)
Function for calculating gene count correlations and plotting results
plot_gene_count_correlations <- function(this_data = expr_gene_and_read_counts_to_plot, this_plot_title = "gene count correlations"){
this_data <- this_data %>%
mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))
these_stats <- this_data %>%
group_by(Read_type, Expression_threshold) %>%
summarize(r_corr = round(cor(Gene_count, Read_count), 2))
# unique(this_data$Read_type)
# dput(unique(this_data$Read_type))
# these_colors_with_MEND
colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00",
"MND"="#E31A1C", "MEND"="#000000")
ggplot(this_data,
aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black") +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Read_count/1E6),
y=max(this_data$Gene_count/1e3),
hjust = 1, vjust = 1
) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
scale_color_manual(values = colors_for_correlation_plot) +
facet_grid(Read_type~Expression_threshold) +
ggtitle(this_plot_title) +
theme(legend.position="none") +
xlab("Read counts (million)") +
ylab("Gene counts (thousand)")
}
corr plot for figure
max_total_read_depth <- 250*1e6
print(min_genes_gt_0)
## [1] 100
sum(counts_and_meta$genes_expressed_above_0 <= min_genes_gt_0)
## [1] 78
sum(counts_and_meta$Total_reads > max_total_read_depth)
## [1] 105
# min_genes_gt_0
exclude_for_depth <- counts_and_meta %>%
filter(Total_reads > max_total_read_depth) %>%
pull(dataset_id)
exclude_for_gene_count <- counts_and_meta %>%
filter(genes_expressed_above_0 < min_genes_gt_0) %>%
pull(dataset_id)
this_data <- expr_gene_and_read_counts_to_plot %>%
filter(
Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
! dataset_id %in% exclude_for_depth,
! dataset_id %in% exclude_for_gene_count
) %>%
mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))
n_distinct(this_data$dataset_id)
## [1] 1996
this_plot_title <- paste("Correlations calculated across all but datasets with excessive read depth or low gene count")
these_stats <- this_data %>%
group_by(Read_type, Expression_threshold) %>%
summarize(r_corr = round(cor(Gene_count, Read_count), 2))
colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00",
"MND"="#E31A1C", "MEND"="#000000")
cor_plots_excluding_high_read_depth_and_low_gene_count_datasets <- ggplot(this_data,
aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black") +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Read_count/1E6),
y=max(this_data$Gene_count/1e3),
hjust = 1, vjust = 1
) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5),
strip.text = element_text(face = "bold")) +
scale_color_manual(values = colors_for_correlation_plot) +
facet_grid(Read_type~Expression_threshold) +
theme(legend.position="none") +
scale_x_continuous("Read counts (million)", n.breaks = 4) +
ylab("Gene counts (thousand)")
cor_plots_excluding_high_read_depth_and_low_gene_count_datasets
## `geom_smooth()` using formula 'y ~ x'

Plot sequence length against pct duplicates
ggplot(read_counts_with_read_type_fractions) +
geom_point(aes(x=seq_length, y=pct_dupe_of_mapped))

ggplot(read_counts_with_read_type_fractions) +
geom_boxplot(aes(x=seq_length, y=pct_dupe_of_mapped, group = seq_length))

Generate bibtex formatted citations for packages
used_pkgs <- c("base", "tidyverse", "janitor", "knitr", "cowplot", "RColorBrewer", "pander", "kableExtra", "snakecase", "corrr")
write_bib(used_pkgs)
## @Manual{R-base,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2019},
## url = {https://www.R-project.org/},
## }
## @Manual{R-corrr,
## title = {corrr: Correlations in R},
## author = {Edgar Ruiz and Simon Jackson and Jorge Cimentada},
## year = {2019},
## note = {R package version 0.4.0},
## url = {https://CRAN.R-project.org/package=corrr},
## }
## @Manual{R-cowplot,
## title = {cowplot: Streamlined Plot Theme and Plot Annotations for 'ggplot2'},
## author = {Claus O. Wilke},
## year = {2019},
## note = {R package version 1.0.0},
## url = {https://CRAN.R-project.org/package=cowplot},
## }
## @Manual{R-janitor,
## title = {janitor: Simple Tools for Examining and Cleaning Dirty Data},
## author = {Sam Firke},
## year = {2019},
## note = {R package version 1.2.0},
## url = {https://CRAN.R-project.org/package=janitor},
## }
## @Manual{R-kableExtra,
## title = {kableExtra: Construct Complex Table with 'kable' and Pipe Syntax},
## author = {Hao Zhu},
## year = {2019},
## note = {R package version 1.1.0},
## url = {https://CRAN.R-project.org/package=kableExtra},
## }
## @Manual{R-knitr,
## title = {knitr: A General-Purpose Package for Dynamic Report Generation in R},
## author = {Yihui Xie},
## year = {2019},
## note = {R package version 1.25},
## url = {https://CRAN.R-project.org/package=knitr},
## }
## @Manual{R-pander,
## title = {pander: An R 'Pandoc' Writer},
## author = {Gergely Daróczi and Roman Tsegelskyi},
## year = {2018},
## note = {R package version 0.6.3},
## url = {https://CRAN.R-project.org/package=pander},
## }
## @Manual{R-RColorBrewer,
## title = {RColorBrewer: ColorBrewer Palettes},
## author = {Erich Neuwirth},
## year = {2014},
## note = {R package version 1.1-2},
## url = {https://CRAN.R-project.org/package=RColorBrewer},
## }
## @Manual{R-snakecase,
## title = {snakecase: Convert Strings into any Case},
## author = {Malte Grosser},
## year = {2019},
## note = {R package version 0.11.0},
## url = {https://CRAN.R-project.org/package=snakecase},
## }
## @Manual{R-tidyverse,
## title = {tidyverse: Easily Install and Load the 'Tidyverse'},
## author = {Hadley Wickham},
## year = {2017},
## note = {R package version 1.2.1},
## url = {https://CRAN.R-project.org/package=tidyverse},
## }
Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?
cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads,
read_counts_with_read_type_fractions$pct_dupe_of_mapped,
method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34
SessionInfo
pander(sessionInfo(), compact = FALSE)
R version 3.6.0 (2019-04-26)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8
attached base packages:
- stats
- graphics
- grDevices
- utils
- datasets
- methods
- base
other attached packages:
- corrr(v.0.4.0)
- snakecase(v.0.11.0)
- kableExtra(v.1.1.0)
- pander(v.0.6.3)
- RColorBrewer(v.1.1-2)
- cowplot(v.1.0.0)
- knitr(v.1.25)
- janitor(v.1.2.0)
- forcats(v.0.4.0)
- stringr(v.1.4.0)
- dplyr(v.0.8.5)
- purrr(v.0.3.3)
- readr(v.1.3.1)
- tidyr(v.1.0.0)
- tibble(v.3.0.1)
- ggplot2(v.3.3.0)
- tidyverse(v.1.2.1)
loaded via a namespace (and not attached):
- tidyselect(v.1.1.0)
- xfun(v.0.10)
- lattice(v.0.20-38)
- splines(v.3.6.0)
- haven(v.2.1.1)
- colorspace(v.1.4-1)
- vctrs(v.0.3.0)
- generics(v.0.0.2)
- htmltools(v.0.4.0)
- viridisLite(v.0.3.0)
- mgcv(v.1.8-29)
- yaml(v.2.2.0)
- rlang(v.0.4.6)
- pillar(v.1.4.3)
- glue(v.1.4.1)
- withr(v.2.1.2)
- modelr(v.0.1.5)
- readxl(v.1.3.1)
- lifecycle(v.0.2.0)
- munsell(v.0.5.0)
- gtable(v.0.3.0)
- cellranger(v.1.1.0)
- rvest(v.0.3.4)
- evaluate(v.0.14)
- labeling(v.0.3)
- fansi(v.0.4.0)
- highr(v.0.8)
- broom(v.0.7.0)
- Rcpp(v.1.0.3)
- scales(v.1.1.0)
- backports(v.1.1.5)
- webshot(v.0.5.1)
- jsonlite(v.1.6)
- farver(v.2.0.1)
- hms(v.0.5.1)
- digest(v.0.6.23)
- stringi(v.1.4.3)
- grid(v.3.6.0)
- cli(v.2.0.0)
- tools(v.3.6.0)
- magrittr(v.1.5)
- crayon(v.1.3.4)
- pkgconfig(v.2.0.3)
- Matrix(v.1.2-17)
- ellipsis(v.0.3.0)
- xml2(v.1.2.2)
- lubridate(v.1.7.4)
- assertthat(v.0.2.1)
- rmarkdown(v.1.16)
- httr(v.1.4.1)
- rstudioapi(v.0.10)
- R6(v.2.4.1)
- nlme(v.3.1-141)
- compiler(v.3.6.0)